library(tidyverse)
library(magrittr)
library(parallel)
library(ngsReports)
library(here)
library(scales)
library(ggpubr)
library(kableExtra)
library(AnnotationHub)
library(ensembldb)
library(edgeR)
library(corrplot)
library(DT)
library(ggrepel)
library(msigdbr)
library(fgsea)
if (interactive()) setwd(here::here())
theme_set(theme_bw())
cores <- detectCores() - 2
ah <- AnnotationHub() %>%
subset(species == "Homo sapiens") %>%
subset(rdataclass == "EnsDb")
ensDb <- ah[["AH83216"]]
trEns <- transcripts(ensDb) %>%
mcols() %>%
as_tibble()
trLen <- exonsBy(ensDb, "tx") %>%
width() %>%
vapply(sum, integer(1))
geneGcLen <- trLen %>%
enframe() %>%
set_colnames(c("tx_id", "length")) %>%
left_join(trEns) %>%
group_by(gene_id) %>%
summarise(
aveLen = mean(length),
maxLen = max(length),
aveGc = sum(length * gc_content) / sum(length),
longestGc = gc_content[which.max(length)[[1]]]
) %>%
mutate(
aveGc = aveGc / 100,
longestGc = longestGc / 100
)
trGcLen <- trLen %>%
enframe() %>%
set_colnames(c("tx_id", "length")) %>%
left_join(trEns) %>%
group_by(tx_id) %>%
summarise(
aveLen = mean(length),
maxLen = max(length),
aveGc = sum(length * gc_content) / sum(length),
longestGc = gc_content[which.max(length)[[1]]]
) %>%
mutate(
aveGc = aveGc / 100,
longestGc = longestGc / 100
)
genesGR <- genes(ensDb)
mcols(genesGR) <- mcols(genesGR)[c("gene_id", "gene_name",
"gene_biotype", "entrezid")]
txGR <- transcripts(ensDb)
mcols(txGR) <- mcols(txGR)[c("tx_id", "tx_name",
"tx_biotype", "tx_id_version", "gene_id")]
An EnsDb object was obtained for Ensembl release 101 using the AnnotationHub package. This provided the GC content and length for every gene and transcript in the release. For humans, this consists of 67990 genes and 252335 transcripts.
files <- list.files(
path = "/hpcfs/users/a1647910/20200310_rRNADepletion/4_T47D_ZR75_DHT_StrippedSerum/0_rawData/FastQC",
pattern = "zip",
full.names = TRUE
)
samples <- tibble(
sample = str_remove(basename(files), "_fastqc.zip"),
dataset = NA,
organism = NA
) %>%
mutate(
dataset = "StrippedSerum",
organism = "human"
)
datasets <- samples$dataset %>%
unique()
The following analysis involves 16 paired-end samples across 1 dataset(s): StrippedSerum.
rawFqc <- files %>%
FastqcDataList()
data <- grep("GLL", fqName(rawFqc))
labels <- rawFqc[data] %>%
fqName() %>%
str_remove("_001\\.fastq\\.gz") %>%
str_remove("Ps2Ex3M1_")
rawLib <- plotReadTotals(rawFqc[data]) +
labs(subtitle = "StrippedSerum") +
scale_x_discrete(labels = labels)
The library sizes of the unprocessed dataset(s) range between 22,057,776 and 42,743,934 reads.
rawLib
rRNA transcripts are known to have high GC content. Therefore, inspecting the GC content of the raw reads is a logical start point for detecting incomplete rRNA removal. A spike in GC content at ~ 70% is expected if this is the case.
plotly::ggplotly(
plotGcContent(
x = rawFqc[data],
plotType = "line",
gcType = "Transcriptome",
species = "Hsapiens"
) +
labs(title = "StrippedSerum Dataset (H. sapiens)") +
theme(legend.position="none")
)
GC content of reads in the dataset. Clear spikes at about 70% GC are observed, which is likely due to incomplete rRNA depletion.
getModule(rawFqc, "Overrep") %>%
group_by(Sequence, Possible_Source) %>%
summarise(`Found In` = n(), `Highest Percentage` = max(Percentage)) %>%
arrange(desc(`Highest Percentage`), desc(`Found In`)) %>%
ungroup() %>%
dplyr::slice(1:30) %>%
mutate(`Highest Percentage` = percent_format(0.01)(`Highest Percentage`/100)) %>%
kable(
align = "llrr",
caption = paste(
"Top", nrow(.),"Overrepresented sequences.",
"The number of samples they were found in is shown,",
"along with the percentage of the most 'contaminated' sample."
)
) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed", "responsive")
)
| Sequence | Possible_Source | Found In | Highest Percentage |
|---|---|---|---|
| CCTTAGGCAACCTGGTGGTCCCCCGCTCCCGGGAGGTCACCATATTGATG | No Hit | 16 | 0.84% |
| CTGGAGTCTTGGAAGCTTGACTACCCTACGTTCTCCTACAAATGGACCTT | No Hit | 16 | 0.60% |
| CTCCTTAGGCAACCTGGTGGTCCCCCGCTCCCGGGAGGTCACCATATTGA | No Hit | 16 | 0.59% |
| CCCAAACCCACTCCACCTTACTACCAGACAACCTTAGCCAAACCATTTAC | No Hit | 8 | 0.55% |
| CCCCTCCTTAGGCAACCTGGTGGTCCCCCGCTCCCGGGAGGTCACCATAT | No Hit | 16 | 0.50% |
| CCAGGCTGGAGTGCAGTGGCTATTCACAGGCGCGATCCCACTACTGATCA | No Hit | 13 | 0.50% |
| CCCTCCTTAGGCAACCTGGTGGTCCCCCGCTCCCGGGAGGTCACCATATT | No Hit | 16 | 0.49% |
| CTTGGTTATAATTTTTCATCTTTCCCTTGCGGTACTATATCTATTGCGCC | No Hit | 16 | 0.49% |
| CTCCGTTTCCGACCTGGGCCGGTTCACCCCTCCTTAGGCAACCTGGTGGT | No Hit | 16 | 0.47% |
| CTTAGGCAACCTGGTGGTCCCCCGCTCCCGGGAGGTCACCATATTGATGC | No Hit | 15 | 0.45% |
| CCCTGTTCTTGGGTGGGTGTGGGTATAATGCTAAGTTGAGATGATATCAT | No Hit | 8 | 0.41% |
| AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA | No Hit | 1 | 0.40% |
| CCCAAACCCACTCCACCCTACTACCAGACAACCTTAGCCAAACCATTTAC | No Hit | 8 | 0.39% |
| CCCTGTTCTTGGGTGGGTGTGGGTATAATACTAAGTTGAGATGATATCAT | No Hit | 8 | 0.39% |
| GTATAATACTAAGTTGAGATGATATCATTTACGGGGGAAGGCGCTTTGTG | No Hit | 8 | 0.39% |
| CGGGGGAAGGCGCTTTGTGAAGTAGGCCTTATTTCTCTTGTCCTTTCGTA | No Hit | 16 | 0.36% |
| GTGGGTATAATGCTAAGTTGAGATGATATCATTTACGGGGGAAGGCGCTT | No Hit | 8 | 0.33% |
| CTCTCTACAAGGTTTTTTCCTAGTGTCCAAAGAGCTGTTCCTCTTTGGAC | No Hit | 16 | 0.33% |
| CTTATTTCTCTTGTCCTTTCGTACAGGGAGGAATTTGAAGTAGATAGAAA | No Hit | 16 | 0.32% |
| GGGTATAATACTAAGTTGAGATGATATCATTTACGGGGGAAGGCGCTTTG | No Hit | 8 | 0.31% |
| CTCAGACCGCGTTCTCTCCCTCTCACTCCCCAATACGGAGAGAAGAACGA | No Hit | 12 | 0.31% |
| GTAAGATTTGCCGAGTTCCTTTTACTTTTTTTAACCTTTCCTTATGGGCA | No Hit | 8 | 0.31% |
| CTGAACTCCTCACACCCAATTGGACCAATCTATCACCCTATAGAAGAACT | No Hit | 16 | 0.29% |
| GTCCAATTGGGTGTGAGGAGTTCAGTTATATGTTTGGGATTTTTTAGGTA | No Hit | 12 | 0.28% |
| CCTCCTTAGGCAACCTGGTGGTCCCCCGCTCCCGGGAGGTCACCATATTG | No Hit | 16 | 0.28% |
| CTGGTTTCGGGGGTCTTAGCTTTGGCTCTCCTTGCAAAGTTATTTCTAGT | No Hit | 15 | 0.28% |
| CTCTAGAATAGGATTGCGCTGTTATCCCTAGGGTAACTTGTTCCGTTGGT | No Hit | 16 | 0.26% |
| CTGTTCTTGGGTGGGTGTGGGTATAATACTAAGTTGAGATGATATCATTT | No Hit | 5 | 0.25% |
| CTCCGAGGTCGCCCCAACCGAAATTTTTAATGCAGGTTTGGTAGTTTAGG | No Hit | 16 | 0.25% |
| CTCAGGCTGGAGTGCAGTGGCTATTCACAGGCGCGATCCCACTACTGATC | No Hit | 13 | 0.25% |
Raw libraries were trimmed using cutadapt v1.14 to remove Illumina adapter sequences. Bases with PHRED score < 30, NextSeq-induced polyG runs and reads shorter than 35bp were also removed.
trimFqc <- list.files(
path = "/hpcfs/users/a1647910/20200310_rRNADepletion/4_T47D_ZR75_DHT_StrippedSerum/1_trimmedData/FastQC",
pattern = "zip",
full.names = TRUE
) %>%
FastqcDataList()
trimStats <- readTotals(rawFqc) %>%
dplyr::rename(Raw = Total_Sequences) %>%
left_join(readTotals(trimFqc), by = "Filename") %>%
dplyr::rename(Trimmed = Total_Sequences) %>%
mutate(
Discarded = 1 - Trimmed/Raw,
Retained = Trimmed / Raw
)
After trimming of adapters between 6.09% and 8.36% of reads were discarded.
Trimmed reads were:
Aligned to rRNA sequences using the BWA-MEM algorithm to estimate the proportion of reads that were of rRNA origin within each sample. BWA-MEM is recommended for high-quality queries of reads ranging from 70bp to 1Mbp as it is faster and more accurate that alternative algorithms BWA-backtrack and BWA-SW.
Aligned to the Homo sapiens GRCh38 genome (Ensembl release 101) using STAR v2.7.0d and summarised with featureCounts from the Subread v1.5.2 package. These counts were used for all gene-level analysis.
rRnaProp <- read.delim(
"/hpcfs/users/a1647910/20200310_rRNADepletion/4_T47D_ZR75_DHT_StrippedSerum/3_bwa/log/samples.mapped.all",
sep = ":",
col.names = c("sample", "proportion"),
header = FALSE
) %>%
mutate(
sample = str_remove_all(sample, ".mapped"),
sample = basename(sample),
proportion = proportion/100,
dataset = "StrippedSerum",
organism = "human"
) %>%
as_tibble()
rRnaProp$dataset %<>%
factor(levels = c("StrippedSerum"))
rRnaProp %>%
ggplot(aes(sample, proportion)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~dataset, scales = "free_x") +
scale_y_continuous(labels = percent) +
labs(x = "Sample", y = "Percent of Total", fill = "Read pair") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Percentages of each library that align to rRNA sequences with bwa mem.
dgeList <- read_tsv("/hpcfs/users/a1647910/20200310_rRNADepletion/4_T47D_ZR75_DHT_StrippedSerum/4_star2pass/featureCounts/genes.out") %>%
set_colnames(basename(colnames(.))) %>%
set_colnames(str_remove(colnames(.), "Aligned.sortedByCoord.out.bam")) %>%
as.data.frame() %>%
column_to_rownames("Geneid") %>%
DGEList() %>%
calcNormFactors()
metaData <- read_tsv("/hpcfs/users/a1647910/20200310_rRNADepletion/4_T47D_ZR75_DHT_StrippedSerum/T47D_ZR75_DHT_StrippedSerum.tsv")
dgeList$genes <- genesGR[rownames(dgeList),]
mcols(dgeList$genes) %<>%
as.data.frame() %>%
left_join(geneGcLen)
addInfo <- tibble(
sample = rRnaProp$sample,
dataset = "StrippedSerum",
organism = "human",
rRNA = rRnaProp$proportion
) %>%
left_join(metaData)
dgeList$samples %<>%
rownames_to_column("rowname") %>%
mutate(sample = rowname) %>%
left_join(addInfo) %>%
column_to_rownames("rowname") %>%
mutate(
filenames = paste0(
"/hpcfs/users/a1647910/20200310_rRNADepletion/",
"4_T47D_ZR75_DHT_StrippedSerum/4_star2pass/bam/",
sample,
"Aligned.sortedByCoord.out.bam"
)
)
dgeList$samples$filenames <- list.files(
"/hpcfs/users/a1647910/20200310_rRNADepletion/4_T47D_ZR75_DHT_StrippedSerum/4_star2pass/bam",
pattern = ".bam$",
full.names = TRUE
)
dgeList$samples$group <- paste0(
dgeList$samples$cell_line, "_", dgeList$samples$treat
) %>%
make.names() %>%
factor(levels = unique(.))
gcInfo <- function(x) {
x$counts %>%
as.data.frame() %>%
rownames_to_column("gene_id") %>%
as_tibble() %>%
pivot_longer(
cols = colnames(x),
names_to = "sample",
values_to = "counts"
) %>%
dplyr::filter(
counts > 0
) %>%
left_join(
geneGcLen
) %>%
dplyr::select(
ends_with("id"), sample, counts, aveGc, maxLen
) %>%
split(f = .$sample) %>%
lapply(
function(x){
DataFrame(
gc = Rle(x$aveGc, x$counts),
logLen = Rle(log10(x$maxLen), x$counts)
)
}
)
}
gcSummary <- function(x) {
x %>%
vapply(function(x){
c(mean(x$gc), sd(x$gc), mean(x$logLen), sd(x$logLen))
}, numeric(4)
) %>%
t() %>%
set_colnames(
c("mn_gc", "sd_gc", "mn_logLen", "sd_logLen")
) %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble()
}
rle <- gcInfo(dgeList)
sumGc <- gcSummary(rle)
a <- sumGc %>%
left_join(dgeList$samples) %>%
ggplot(aes(rRNA, mn_logLen)) +
geom_point(aes(colour = treat, shape = cell_line), size = 3) +
geom_smooth(method = "lm") +
scale_x_continuous(labels = percent) +
labs(
x = "rRNA Proportion of Initial Library",
y = "Mean log(Length)",
colour = "Treatment",
shape = "Cell line"
)
b <- sumGc %>%
left_join(dgeList$samples) %>%
ggplot(aes(rRNA, mn_gc)) +
geom_point(aes(colour = treat, shape = cell_line), size = 3) +
geom_smooth(method = "lm") +
scale_y_continuous(labels = percent) +
scale_x_continuous(labels = percent) +
labs(
x = "rRNA Proportion of Initial Library",
y = "Mean GC Content",
colour = "Treatment",
shape = "Cell line"
)
ggarrange(
a, b, ncol = 2, nrow = 1,
common.legend = TRUE, legend = "bottom"
) %>%
annotate_figure("StrippedSerum Dataset (H. sapiens)")
Comparison of residual bias potentially introduced by incomplete rRNA removal. Regression lines are shown along with standard error bands for each comparison.
genes2keep <- dgeList %>%
cpm() %>%
is_greater_than(1) %>%
rowSums() %>%
is_weakly_greater_than(6)
dgeFilt <- dgeList[genes2keep,, keep.lib.sizes = FALSE] %>%
calcNormFactors()
pca <- cpm(dgeFilt, log = TRUE) %>%
t() %>%
prcomp()
pcaCor <- pca$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
left_join(sumGc) %>%
as_tibble() %>%
left_join(dgeList$samples) %>%
dplyr::select(
PC1, PC2, PC3,
Mean_GC = mn_gc,
Mean_Length = mn_logLen,
rRna_Proportion = rRNA
) %>%
cor()
a <- pca$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
left_join(dgeList$samples) %>%
as_tibble() %>%
ggplot(aes(PC1, PC2)) +
geom_point(aes(colour = treat, shape = cell_line), size = 2) +
labs(
x = paste0("PC1 (", percent(summary(pca)$importance["Proportion of Variance","PC1"]),")"),
y = paste0("PC2 (", percent(summary(pca)$importance["Proportion of Variance","PC2"]),")"),
colour = "Treatment",
shape = "Cell line"
)
b <- pca$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
left_join(dgeList$samples) %>%
ggplot(aes(PC1, rRNA, label = rRNA)) +
geom_point(aes(colour = treat, shape = cell_line), size = 2) +
geom_smooth(method = "lm") +
geom_text_repel(show.legend = FALSE) +
scale_y_continuous(labels = percent) +
labs(
x = paste0("PC1 (", percent(summary(pca)$importance["Proportion of Variance","PC1"]),")"),
y = "rRNA Proportion of Initial Library",
colour = "Treatment",
shape = "Cell line"
)
c <- pca$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
left_join(sumGc) %>%
left_join(dgeList$samples) %>%
as_tibble() %>%
ggplot(aes(PC1, mn_logLen)) +
geom_point(aes(colour = treat, shape = cell_line), size = 2) +
geom_smooth(method = "lm") +
labs(
x = paste0("PC1 (", percent(summary(pca)$importance["Proportion of Variance","PC1"]),")"),
y = "Mean log(Length)",
colour = "Treatment",
shape = "Cell line"
)
d <- pca$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
left_join(sumGc) %>%
left_join(dgeList$samples) %>%
as_tibble() %>%
ggplot(aes(PC1, mn_gc)) +
geom_point(aes(colour = treat, shape = cell_line), size = 2) +
geom_smooth(method = "lm") +
scale_y_continuous(labels = percent) +
labs(
x = paste0("PC1 (", percent(summary(pca)$importance["Proportion of Variance","PC1"]),")"),
y = "Mean GC",
colour = "Treatment",
shape = "Cell line"
)
ggarrange(
a, b, c, d, ncol = 2, nrow = 2,
common.legend = TRUE, legend = "bottom"
) %>%
annotate_figure("StrippedSerum")
PCA plot showing rRNA proportion, mean GC content and mean log(length) after summarisation to gene-level.
corrplot(
pcaCor,
type = "lower",
diag = FALSE,
addCoef.col = 1, addCoefasPercent = TRUE
)
Correlations between the first three principal components and measured variables: mean GC content, mean log(length) and rRNA proportion.
displayRes_de <- function(x){
de <- x %>%
dplyr::filter(DE)
de %>%
dplyr::slice(1:1000) %>%
dplyr::select(-gene_biotype, -coef, -DE) %>%
mutate(across(c("P.Value", "FDR", "Bonf"), ~ sprintf("%.2e", .x))) %>%
datatable(
options = list(pageLength = 10),
class = "striped hover condensed responsive",
filter = "top",
caption = paste0(
x$coef[1],
": ",
nrow(de),
" of ",
nrow(x),
" genes were classified as differentially expressed ",
"with a FDR < 0.05. ",
"If more than 1000 genes were classified as DE, only the top 1000 are shown."
)
) %>%
formatRound(c("logFC", "logCPM", "F"), digits = 2)
}
design_r <- model.matrix(~rRNA, data = dgeFilt$samples)
fit_r <- dgeFilt %>%
estimateDisp(design = design_r) %>%
glmQLFit()
res_r <- glmQLFTest(fit_r) %>%
topTags(n = Inf) %>%
.[["table"]] %>%
rename_all(
str_remove, pattern = "ID."
) %>%
dplyr::select(
Geneid = gene_id, Symbol = gene_name, gene_biotype, logFC, logCPM, F,
P.Value = PValue, FDR, aveLen, maxLen, aveGc, longestGc
) %>%
as_tibble() %>%
mutate(
Bonf = p.adjust(P.Value, "bonf"),
DE = FDR < 0.05
)
res_r %>%
dplyr::filter(DE) %>%
dplyr::slice(1:1000) %>%
dplyr::select(-gene_biotype, -DE, -aveLen, -maxLen, -aveGc, -longestGc) %>%
mutate(across(c("P.Value", "FDR", "Bonf"), ~ sprintf("%.2e", .x))) %>%
datatable(
options = list(pageLength = 10),
class = "striped hover condensed responsive",
filter = "top",
caption = paste0(
"rRNA: ",
nrow(dplyr::filter(res_r, DE)),
" of ",
nrow(res_r),
" genes were classified as differentially expressed ",
"with a FDR < 0.05. ",
"If more than 1000 genes were classified as DE, only the top 1000 are shown."
)
) %>%
formatRound(c("logFC", "logCPM", "F"), digits = 2)
design_t <- model.matrix(~0+group, data = dgeFilt$samples)
cont_t <- makeContrasts(
groupT.47D_DHT = groupT.47D_DHT - groupT.47D_Vehicle,
groupZR.75.1_DHT = groupZR.75.1_DHT - groupZR.75.1_Vehicle,
levels = colnames(design_t)
)
fit_t <- dgeFilt %>%
estimateDisp(design = design_t) %>%
glmQLFit()
res_t <- colnames(cont_t) %>%
sapply(function(x){
glmQLFTest(fit_t, contrast = cont_t[,x]) %>%
topTags(n = Inf) %>%
.[["table"]] %>%
rename_all(
str_remove, pattern = "ID."
) %>%
dplyr::select(
Geneid = gene_id, Symbol = gene_name, gene_biotype, logFC, logCPM, F,
P.Value = PValue, FDR,
) %>%
as_tibble() %>%
mutate(
Bonf = p.adjust(P.Value, "bonf"),
coef = x,
DE = FDR < 0.05
)
},
simplify = FALSE)
displayRes_de(res_t$groupT.47D_DHT)
displayRes_de(res_t$groupZR.75.1_DHT)
design_tr <- model.matrix(~0+rRNA+group, data = dgeFilt$samples)
cont_tr <- makeContrasts(
groupT.47D_DHT = groupT.47D_DHT - groupT.47D_Vehicle,
groupZR.75.1_DHT = groupZR.75.1_DHT - groupZR.75.1_Vehicle,
rRNA = rRNA,
levels = colnames(design_tr)
)
fit_tr <- dgeFilt %>%
estimateDisp(design = design_tr) %>%
glmQLFit()
res_tr <- colnames(cont_tr) %>%
sapply(function(x){
glmQLFTest(fit_tr, contrast = cont_tr[,x]) %>%
topTags(n = Inf) %>%
.[["table"]] %>%
rename_all(
str_remove, pattern = "ID."
) %>%
dplyr::select(
Geneid = gene_id, Symbol = gene_name, gene_biotype, logFC, logCPM, F,
P.Value = PValue, FDR,
) %>%
as_tibble() %>%
mutate(
Bonf = p.adjust(P.Value, "bonf"),
coef = x,
DE = FDR < 0.05
)
},
simplify = FALSE)
displayRes_de(res_tr$rRNA)
displayRes_de(res_tr$groupT.47D_DHT)
displayRes_de(res_tr$groupZR.75.1_DHT)
entrezGenes <- mcols(dgeList$genes) %>%
as.data.frame() %>%
dplyr::filter(!is.na(entrezid)) %>%
unnest(cols = entrezid) %>%
dplyr::rename(entrez_gene = entrezid)
ranks_r <- res_r %>%
mutate(stat = -sign(logFC) * log10(P.Value)) %>%
dplyr::arrange(stat) %>%
with(structure(stat, names = Geneid))
ranks_t <- res_t %>%
lapply(function(x){
x %>%
mutate(stat = -sign(logFC) * log10(P.Value)) %>%
dplyr::arrange(stat) %>%
with(structure(stat, names = Geneid))
})
ranks_tr <- res_tr %>%
lapply(function(x){
x %>%
mutate(stat = -sign(logFC) * log10(P.Value)) %>%
dplyr::arrange(stat) %>%
with(structure(stat, names = Geneid))
})
displayRes_enrich <- function(x, cap){
x %>%
unnest(cols = leadingEdge) %>%
group_by(pathway) %>%
mutate(
leadingSize = n(),
pathway = str_remove(pathway, "HALLMARK_|KEGG_|WP_"),
pathway = str_trunc(pathway, 33)
) %>%
distinct(pathway, .keep_all = TRUE) %>%
dplyr::select(-leadingEdge) %>%
mutate(across(c("pval", "FDR", "padj"), ~ sprintf("%.2e", .x))) %>%
datatable(
options = list(pageLength = 10),
class = "striped hover condensed responsive",
filter = "top",
caption = paste(cap)
) %>%
formatRound(c("log2err", "ES", "NES"), digits = 2)
}
hm <- msigdbr("Homo sapiens", category = "H") %>%
left_join(entrezGenes) %>%
dplyr::filter(!is.na(gene_id)) %>%
distinct(gs_name, gene_id, .keep_all = TRUE)
hmByGene <- hm %>%
split(f = .$gene_id) %>%
lapply(extract2, "gs_name")
hmByID <- hm %>%
split(f = .$gs_name) %>%
lapply(extract2, "gene_id")
kg <- msigdbr("Homo sapiens", category = "C2", subcategory = "CP:KEGG") %>%
left_join(entrezGenes) %>%
dplyr::filter(!is.na(gene_id)) %>%
distinct(gs_name, gene_id, .keep_all = TRUE)
kgByGene <- kg %>%
split(f = .$gene_id) %>%
lapply(extract2, "gs_name")
kgByID <- kg %>%
split(f = .$gs_name) %>%
lapply(extract2, "gene_id")
wk <- msigdbr("Homo sapiens", category = "C2", subcategory = "CP:WIKIPATHWAYS") %>%
left_join(entrezGenes) %>%
dplyr::filter(!is.na(gene_id)) %>%
distinct(gs_name, gene_id, .keep_all = TRUE)
wkByGene <- wk %>%
split(f = .$gene_id) %>%
lapply(extract2, "gs_name")
wkByID <- wk %>%
split(f = .$gs_name) %>%
lapply(extract2, "gene_id")
gsSizes <- bind_rows(hm, kg, wk) %>%
dplyr::select(gs_name, gene_symbol, gene_id) %>%
chop(c(gene_symbol, gene_id)) %>%
mutate(
gs_size = vapply(gene_symbol, length, integer(1))
)
fgsea_r_hm <-fgsea(hmByID, ranks_r, eps = 0) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::arrange(pval)
displayRes_enrich(fgsea_r_hm, "rRNA")
fgsea_t_hm <- ranks_t %>%
lapply(function(x){
fgsea(hmByID, x, eps = 0) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::arrange(pval)
})
displayRes_enrich(fgsea_t_hm$groupT.47D_DHT, "T-47D_DHT")
displayRes_enrich(fgsea_t_hm$groupZR.75.1_DHT, "ZR-75-1_DHT")
fgsea_tr_hm <- ranks_tr %>%
lapply(function(x){
fgsea(hmByID, x, eps = 0) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::arrange(pval)
})
displayRes_enrich(fgsea_tr_hm$rRNA, "rRNA")
displayRes_enrich(fgsea_tr_hm$groupT.47D_DHT, "T-47D_DHT")
displayRes_enrich(fgsea_tr_hm$groupZR.75.1_DHT, "ZR-75-1_DHT")
fgsea_r_kg <-fgsea(kgByID, ranks_r, eps = 0) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::arrange(pval)
displayRes_enrich(fgsea_r_kg, "rRNA")
fgsea_t_kg <- ranks_t %>%
lapply(function(x){
fgsea(kgByID, x, eps = 0) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::arrange(pval)
})
displayRes_enrich(fgsea_t_kg$groupT.47D_DHT, "T-47D_DHT")
displayRes_enrich(fgsea_t_kg$groupZR.75.1_DHT, "ZR-75-1_DHT")
fgsea_tr_kg <- ranks_tr %>%
lapply(function(x){
fgsea(kgByID, x, eps = 0) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::arrange(pval)
})
displayRes_enrich(fgsea_tr_kg$rRNA, "rRNA")
displayRes_enrich(fgsea_tr_kg$groupT.47D_DHT, "T-47D_DHT")
displayRes_enrich(fgsea_tr_kg$groupZR.75.1_DHT, "ZR-75-1_DHT")
fgsea_r_wk <-fgsea(wkByID, ranks_r, eps = 0) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::arrange(pval)
displayRes_enrich(fgsea_r_wk, "rRNA")
fgsea_t_wk <- ranks_t %>%
lapply(function(x){
fgsea(wkByID, x, eps = 0) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::arrange(pval)
})
displayRes_enrich(fgsea_t_wk$groupT.47D_DHT, "T-47D_DHT")
displayRes_enrich(fgsea_t_wk$groupZR.75.1_DHT, "ZR-75-1_DHT")
fgsea_tr_wk <- ranks_tr %>%
lapply(function(x){
fgsea(wkByID, x, eps = 0) %>%
as_tibble() %>%
dplyr::rename(FDR = padj) %>%
mutate(padj = p.adjust(pval, "bonferroni")) %>%
dplyr::arrange(pval)
})
displayRes_enrich(fgsea_tr_wk$rRNA, "rRNA")
displayRes_enrich(fgsea_tr_wk$groupT.47D_DHT, "T-47D_DHT")
displayRes_enrich(fgsea_tr_wk$groupZR.75.1_DHT, "ZR-75-1_DHT")
save(
addInfo,
dgeList,
dgeFilt,
res_r,
res_t,
res_tr,
file = here::here(
"4_T47D_ZR75_DHT_StrippedSerum/R/output/4_1_DE.RData"
)
)
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.1 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8
## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8
## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] fgsea_1.14.0 msigdbr_7.2.1 ggrepel_0.8.2
## [4] DT_0.14 corrplot_0.84 edgeR_3.30.3
## [7] limma_3.44.3 ensembldb_2.12.1 AnnotationFilter_1.12.0
## [10] GenomicFeatures_1.40.1 AnnotationDbi_1.50.1 Biobase_2.48.0
## [13] GenomicRanges_1.40.0 GenomeInfoDb_1.24.2 IRanges_2.22.2
## [16] S4Vectors_0.26.1 AnnotationHub_2.20.0 BiocFileCache_1.12.0
## [19] dbplyr_1.4.4 kableExtra_1.1.0 ggpubr_0.4.0
## [22] scales_1.1.1 here_0.1 ngsReports_1.4.2
## [25] BiocGenerics_0.34.0 magrittr_1.5 forcats_0.5.0
## [28] stringr_1.4.0 dplyr_1.0.0 purrr_0.3.4
## [31] readr_1.3.1 tidyr_1.1.0 tibble_3.0.3
## [34] ggplot2_3.3.2 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.1.0 RSQLite_2.2.0
## [3] htmlwidgets_1.5.1 FactoMineR_2.3
## [5] grid_4.0.3 BiocParallel_1.22.0
## [7] munsell_0.5.0 withr_2.2.0
## [9] colorspace_1.4-1 highr_0.8
## [11] knitr_1.29 rstudioapi_0.11
## [13] leaps_3.1 ggsignif_0.6.0
## [15] labeling_0.3 GenomeInfoDbData_1.2.3
## [17] hwriter_1.3.2 bit64_0.9-7.1
## [19] farver_2.0.3 rprojroot_1.3-2
## [21] vctrs_0.3.2 generics_0.0.2
## [23] xfun_0.15 R6_2.4.1
## [25] locfit_1.5-9.4 bitops_1.0-6
## [27] DelayedArray_0.14.1 assertthat_0.2.1
## [29] promises_1.1.1 gtable_0.3.0
## [31] Cairo_1.5-12.2 rlang_0.4.7
## [33] scatterplot3d_0.3-41 splines_4.0.3
## [35] rtracklayer_1.48.0 rstatix_0.6.0
## [37] lazyeval_0.2.2 broom_0.7.0
## [39] BiocManager_1.30.10 yaml_2.2.1
## [41] reshape2_1.4.4 abind_1.4-5
## [43] modelr_0.1.8 crosstalk_1.1.0.1
## [45] backports_1.1.8 httpuv_1.5.4
## [47] tools_4.0.3 ellipsis_0.3.1
## [49] RColorBrewer_1.1-2 ggdendro_0.1.22
## [51] Rcpp_1.0.5 plyr_1.8.6
## [53] progress_1.2.2 zlibbioc_1.34.0
## [55] RCurl_1.98-1.2 prettyunits_1.1.1
## [57] openssl_1.4.2 cowplot_1.0.0
## [59] zoo_1.8-8 SummarizedExperiment_1.18.2
## [61] haven_2.3.1 cluster_2.1.0
## [63] fs_1.4.2 data.table_1.12.8
## [65] openxlsx_4.1.5 reprex_0.3.0
## [67] truncnorm_1.0-8 ProtGenerics_1.20.0
## [69] matrixStats_0.56.0 hms_0.5.3
## [71] mime_0.9 evaluate_0.14
## [73] xtable_1.8-4 XML_3.99-0.4
## [75] rio_0.5.16 jpeg_0.1-8.1
## [77] readxl_1.3.1 gridExtra_2.3
## [79] compiler_4.0.3 biomaRt_2.44.1
## [81] crayon_1.3.4 htmltools_0.5.0
## [83] mgcv_1.8-33 later_1.1.0.1
## [85] lubridate_1.7.9 DBI_1.1.0
## [87] MASS_7.3-53 rappdirs_0.3.1
## [89] ShortRead_1.46.0 Matrix_1.2-18
## [91] car_3.0-8 cli_2.0.2
## [93] pkgconfig_2.0.3 flashClust_1.01-2
## [95] GenomicAlignments_1.24.0 foreign_0.8-80
## [97] plotly_4.9.2.1 xml2_1.3.2
## [99] webshot_0.5.2 XVector_0.28.0
## [101] rvest_0.3.5 digest_0.6.25
## [103] Biostrings_2.56.0 rmarkdown_2.3
## [105] cellranger_1.1.0 fastmatch_1.1-0
## [107] curl_4.3 shiny_1.5.0
## [109] Rsamtools_2.4.0 lifecycle_0.2.0
## [111] nlme_3.1-149 jsonlite_1.7.0
## [113] carData_3.0-4 viridisLite_0.3.0
## [115] askpass_1.1 fansi_0.4.1
## [117] pillar_1.4.6 lattice_0.20-41
## [119] fastmap_1.0.1 httr_1.4.1
## [121] interactiveDisplayBase_1.26.3 glue_1.4.1
## [123] zip_2.0.4 png_0.1-7
## [125] pander_0.6.3 BiocVersion_3.11.1
## [127] bit_1.1-15.2 stringi_1.4.6
## [129] blob_1.2.1 latticeExtra_0.6-29
## [131] memoise_1.1.0